Import Libraries#

We are importing different libraries for the data processing and analysis

import pandas as pd
import geopandas as gpd
import datetime
import jenkspy
import seaborn as sns
import numpy as np
from shapely.geometry import Point, LineString, Polygon, MultiPolygon
import matplotlib.pyplot as plt

Reading and Exploring Data#

The dataset was provided with the task as a csv file with documentation describing data in different columns. It contains data as output from LiDAR-based Multiple Object Tracking system. In the following steps we will explore the dataset.

Reading dataset and getting general info about data in different columns, showing the head part of the dataframe:

# Load the raw LiDAR multi-object-tracking export; the file is semicolon-separated.
data = pd.read_csv('full_intersection_15.csv', sep=";")
# Show column dtypes / non-null counts, then peek at the first rows.
data.info()
data.head()
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[2], line 1
----> 1 data = pd.read_csv('full_intersection_15.csv', sep=";")
      2 data.info()
      3 data.head()

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/util/_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
    209     else:
    210         kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    325 if len(args) > num_allow_args:
    326     warnings.warn(
    327         msg.format(arguments=_format_argument_list(allow_args)),
    328         FutureWarning,
    329         stacklevel=find_stack_level(),
    330     )
--> 331 return func(*args, **kwargs)

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    935 kwds_defaults = _refine_defaults_read(
    936     dialect,
    937     delimiter,
   (...)
    946     defaults={"delimiter": ","},
    947 )
    948 kwds.update(kwds_defaults)
--> 950 return _read(filepath_or_buffer, kwds)

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:605, in _read(filepath_or_buffer, kwds)
    602 _validate_names(kwds.get("names", None))
    604 # Create the parser.
--> 605 parser = TextFileReader(filepath_or_buffer, **kwds)
    607 if chunksize or iterator:
    608     return parser

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds)
   1439     self.options["has_index_names"] = kwds["has_index_names"]
   1441 self.handles: IOHandles | None = None
-> 1442 self._engine = self._make_engine(f, self.engine)

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1735, in TextFileReader._make_engine(self, f, engine)
   1733     if "b" not in mode:
   1734         mode += "b"
-> 1735 self.handles = get_handle(
   1736     f,
   1737     mode,
   1738     encoding=self.options.get("encoding", None),
   1739     compression=self.options.get("compression", None),
   1740     memory_map=self.options.get("memory_map", False),
   1741     is_text=is_text,
   1742     errors=self.options.get("encoding_errors", "strict"),
   1743     storage_options=self.options.get("storage_options", None),
   1744 )
   1745 assert self.handles is not None
   1746 f = self.handles.handle

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/io/common.py:856, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    851 elif isinstance(handle, str):
    852     # Check whether the filename is to be opened in binary mode.
    853     # Binary mode does not support 'encoding' and 'newline'.
    854     if ioargs.encoding and "b" not in ioargs.mode:
    855         # Encoding
--> 856         handle = open(
    857             handle,
    858             ioargs.mode,
    859             encoding=ioargs.encoding,
    860             errors=errors,
    861             newline="",
    862         )
    863     else:
    864         # Binary mode
    865         handle = open(handle, ioargs.mode)

FileNotFoundError: [Errno 2] No such file or directory: 'full_intersection_15.csv'

The data has 2563480 entries (trajectory points) with different characteristics such as object_id, timestamp (in ms), longitude and latitude, heading, height, width, length of the vehicle, velocity, tracking status and object_type.

Exploring the data#

Area of Research#

The data was collected in Salzburg, Austria at the intersection of Sterneckstraße, Linzer Bundesstraße, Ignaz-Härtl-Straße and Fürbergstraße

# 200 m buffer around the intersection centre: build the point in WGS84
# (EPSG:4326), then reproject to UTM zone 33N (EPSG:32633) so the buffer
# radius is expressed in metres rather than degrees.
buffer = gpd.GeoSeries([Point(13.064405, 47.810136)], crs="EPSG:4326").to_crs(epsg=32633).buffer(200)
buffer_gdf = gpd.GeoDataFrame(geometry=buffer, crs =32633 )
# Interactive map with Esri satellite imagery as the basemap.
buffer_gdf.explore(tiles='Esri.WorldImagery')
Make this Notebook Trusted to load map: File -> Trust Notebook

Time#

# Timestamps are epoch milliseconds; convert the dataset's bounds to seconds
# and then to datetime objects for a readable collection-period report.
first_epoch_s = int(min(data['timestamp']) / 1000)
last_epoch_s = int(max(data['timestamp']) / 1000)
minData = datetime.datetime.fromtimestamp(first_epoch_s)
maxData = datetime.datetime.fromtimestamp(last_epoch_s)
print("It was collected from: %s, To: %s" % (minData, maxData))
It was collected from: 2024-04-02 16:00:00, To: 2024-04-02 16:59:59

Total observations#

# Report the raw dataset's size. `data.size` counts every cell
# (rows x columns), not rows; unique objects come from `object_id`.
cell_count = data.size
object_count = data['object_id'].nunique()
print(f"Total observations in a raw dataset: {cell_count}")
print(f"Total unique objects: {object_count}")
Total observations in a raw dataset: 28198280
Total unique objects: 34580

Tracking Status#

One of the main point characteristics is tracking status. Let’s see the structure for points based on this column:

# Count trajectory points per tracking status for the pie chart below.
data_by_status = data.groupby("tracking_status").size()
plt.figure(figsize=(5,5))
# autopct="%.2f" prints each slice's share with two decimals.
data_by_status.plot.pie(autopct="%.2f", colors=sns.color_palette("Set2"))
plt.title('Distribution by Tracking Status')
plt.show()
_images/7cf7d0b70d7f1748a031f66268fdf087fcbf9806627121fd029d47501a93288c.png

More than 82% of all points have “TRACKING” status, which indicates “stable tracking”. The other categories — “INVALIDATING”, “DRIFTING” and “VALIDATING” — are not considered ‘stable tracking’, so we will filter this data out and not consider it in the following analysis

Filtering Objects Based on Tracking Status and Calculating New Number of Observations#

# Keep only points with stable tracking. `.copy()` materialises an
# independent DataFrame so later column assignments (e.g. `time_class`)
# do not raise pandas' SettingWithCopyWarning on a filtered view.
data_cleaned = data[data['tracking_status']=='TRACKING'].copy()
# Shares are integer-truncated by the %i format specifier.
print("Observations Total (cleaned data): %i. Share of raw: %i percent" % (data_cleaned.size, (data_cleaned.size/data.size)*100))
print("Total unique objects (cleaned data): %i. Share of raw: %i percent" % (data_cleaned['object_id'].nunique(), (data_cleaned['object_id'].nunique()/data['object_id'].nunique())*100))
Observations Total (cleaned data): 23349700. Share of raw: 82 percent
Total unique objects (cleaned data): 11025. Share of raw: 31 percent

Objects Types#

The data was collected for different object types: cars, two-wheelers, and pedestrians. Let’s explore the share of each object type in the data.

# Point counts per detected object class (on the tracking-filtered data).
data_by_object = data_cleaned.groupby("object_type").size()

plt.figure(figsize=(5,5))
# Each slice shows its percentage share with two decimals.
data_by_object.plot.pie(autopct="%.2f", colors=sns.color_palette("Set2"))

# Set title and show plot
plt.title('Distribution by Object Type')
plt.show()
_images/2d8e50ee59a39f2ce07d6b0ed94b74158ad67a2c1456ac3db4733c5d2fdf115d.png

More than 58% are cars, around 21% are pedestrians, and 4% are two-wheelers. Additionally, 15.78% are classified as “MISC,” which means that the objects are not considered part of any of the three main classes. We will filter out these objects and, in the following analysis, consider only the three main types.

Filtering Objects Based on Object Type and Calculating New Number of Observations#

# Drop unclassified ("MISC") objects; `.copy()` keeps data_cleaned an
# independent frame so subsequent column assignments are warning-free.
data_cleaned = data_cleaned[data_cleaned['object_type'] != 'MISC'].copy()

Share of all data left after the first filters applied#

# Remaining share after both filters, relative to the raw dataset
# (%i truncates the percentage to an integer).
print("Observations Total (cleaned data): %i. Share of raw: %i percent" % (data_cleaned.size, (data_cleaned.size/data.size)*100))
print("Total unique objects (cleaned data): %i. Share of raw: %i percent" % (data_cleaned['object_id'].nunique(), (data_cleaned['object_id'].nunique()/data['object_id'].nunique())*100))
Observations Total (cleaned data): 19664513. Share of raw: 69 percent
Total unique objects (cleaned data): 6500. Share of raw: 18 percent

After filtering out irrelevant data based on “Tracking Status” and “Object Type,” we have prepared the data for further analysis. Approximately 70% of all points and only 18% of unique objects remain. This indicates that the filtered objects had significantly fewer points on average, supporting the idea that the filtered data was indeed irrelevant.

Data Processing#

In the following section, we will process the data to enhance opportunities for further analysis. We will divide the data into 5-minute intervals, create a GeoDataFrame to work with this dataset as spatial data, create lines (directions) for each object based on point data, and calculate various statistics for each object.

Dividing data into classes based on 5 minutes interval#

We are dividing our dataset into 5-minute intervals for the final estimations in the data quality assessment

# Anchor the intervals at the earliest timestamp in the cleaned data.
min_timestamp = data_cleaned['timestamp'].min()

# Interval width in the same unit as `timestamp` (milliseconds): 5 min = 300000 ms.
milliseconds = pd.Timedelta(minutes=5).total_seconds() * 1000

# Integer bucket index: 0 for the first 5 minutes, 1 for the next, etc.
# (`astype(int)` truncates toward zero, which floors non-negative offsets).
# NOTE(review): this column assignment assumes data_cleaned is an independent
# frame (created with .copy()); on a filtered view pandas would warn.
data_cleaned['time_class'] = ((data_cleaned['timestamp'] - min_timestamp) / milliseconds).astype(int)

data_cleaned.head()
object_id timestamp heading height width length v tracking_status object_type lon lat time_class
0 152997118 1712062811083 132.072 0.735 1.942 4.387 0.03 TRACKING CAR 13.064405 47.810136 0
1 152997118 1712062811183 132.286 0.731 1.861 4.320 0.04 TRACKING CAR 13.064405 47.810136 0
2 152997118 1712062811283 132.229 0.739 1.955 4.395 0.03 TRACKING CAR 13.064405 47.810136 0
3 152997118 1712062811386 131.980 0.729 1.942 4.367 0.04 TRACKING CAR 13.064405 47.810135 0
4 152997118 1712062811485 132.020 0.737 1.875 4.328 0.02 TRACKING CAR 13.064405 47.810136 0

Creating GeoData#

Based on coordinates provided in a csv file, we are creating points as spatial data - geoDataFrame

# Build point geometries from the lon/lat columns; crs=4326 declares the
# coordinates as WGS84 (note the x=lon, y=lat argument order).
points_gdf = gpd.GeoDataFrame(data_cleaned, geometry=gpd.points_from_xy(data_cleaned['lon'], data_cleaned['lat']), crs=4326)

points_gdf.head()
object_id timestamp heading height width length v tracking_status object_type lon lat time_class geometry
0 152997118 1712062811083 132.072 0.735 1.942 4.387 0.03 TRACKING CAR 13.064405 47.810136 0 POINT (13.06440 47.81014)
1 152997118 1712062811183 132.286 0.731 1.861 4.320 0.04 TRACKING CAR 13.064405 47.810136 0 POINT (13.06441 47.81014)
2 152997118 1712062811283 132.229 0.739 1.955 4.395 0.03 TRACKING CAR 13.064405 47.810136 0 POINT (13.06440 47.81014)
3 152997118 1712062811386 131.980 0.729 1.942 4.367 0.04 TRACKING CAR 13.064405 47.810135 0 POINT (13.06441 47.81014)
4 152997118 1712062811485 132.020 0.737 1.875 4.328 0.02 TRACKING CAR 13.064405 47.810136 0 POINT (13.06441 47.81014)

Calculate the distance between the first and the last point#

As one of the characteristics for the analysis we are calculating the distance between the first and the last point for each object

# Reproject to the estimated local UTM zone so distances are in metres.
points_gdf = points_gdf.to_crs(points_gdf.estimate_utm_crs())

# Function to calculate distance between first and last points
def calculate_distance(group):
    """Return the straight-line distance between the first and last
    geometry of one object's trajectory, or None for a single point."""
    if len(group) <= 1:
        return None
    start = group['geometry'].iloc[0]
    end = group['geometry'].iloc[-1]
    return start.distance(end)

# Group by object_id and calculate distances
# (one row per object; single-point objects get None).
distances = points_gdf.groupby('object_id').apply(calculate_distance).reset_index(name='distance_first_last')

# Merge distances back into the original GeoDataFrame
points_gdf = points_gdf.merge(distances, on='object_id', how='left')

# Back to geographic coordinates (WGS84) for mapping.
points_gdf = points_gdf.to_crs(epsg=4326)

points_gdf.head()
object_id timestamp heading height width length v tracking_status object_type lon lat time_class geometry distance_first_last
0 152997118 1712062811083 132.072 0.735 1.942 4.387 0.03 TRACKING CAR 13.064405 47.810136 0 POINT (13.06440 47.81014) 0.165151
1 152997118 1712062811183 132.286 0.731 1.861 4.320 0.04 TRACKING CAR 13.064405 47.810136 0 POINT (13.06441 47.81014) 0.165151
2 152997118 1712062811283 132.229 0.739 1.955 4.395 0.03 TRACKING CAR 13.064405 47.810136 0 POINT (13.06440 47.81014) 0.165151
3 152997118 1712062811386 131.980 0.729 1.942 4.367 0.04 TRACKING CAR 13.064405 47.810135 0 POINT (13.06441 47.81014) 0.165151
4 152997118 1712062811485 132.020 0.737 1.875 4.328 0.02 TRACKING CAR 13.064405 47.810136 0 POINT (13.06441 47.81014) 0.165151

Calculate average heading and velocity change#

As one of the characteristics for the analysis we are calculating the average heading and velocity change for each object to explore possible anomalies based on these parameters

def calculate_average_change(df, value_field):
    """Attach per-object average change of `value_field`.

    Adds `<value_field>_change` (difference between consecutive samples,
    0 for each object's first sample) in place, then returns a merged
    frame carrying `avg_<value_field>_change` for every row.
    """
    change_field = f'{value_field}_change'
    avg_change_field = f'avg_{value_field}_change'

    # Per-object difference between consecutive samples; the first sample
    # of each object has no predecessor, so its change is set to 0.
    df[change_field] = df.groupby('object_id')[value_field].diff().fillna(0)

    # One mean change value per object_id.
    per_object_mean = df.groupby('object_id')[change_field].mean()

    # Broadcast the per-object mean back onto every row of that object.
    return df.merge(per_object_mean.rename(avg_change_field), on='object_id', how='left')
# Calculate and update average heading change for each object in the DataFrame
points_gdf = calculate_average_change(points_gdf, 'heading')
# Same for velocity: adds v_change and avg_v_change columns.
points_gdf = calculate_average_change(points_gdf, 'v')

Creating Lines from Point Observations#

Creating lines from point observation to see the actual path for each object

# Build a LineString from an ordered list of point geometries; a path
# needs at least two points, otherwise there is no line to draw.
def create_linestring(group):
    if len(group) <= 1:
        return None
    return LineString(group)


# Group by 'object_id' and create LineString geometries
# (points are passed in the frame's current row order).
lines = points_gdf.groupby('object_id')['geometry'].apply(lambda x: create_linestring(x.tolist()))


# Create a new GeoDataFrame for the lines
lines_gdf = gpd.GeoDataFrame(lines, geometry='geometry', crs=4326)
# NOTE(review): reset_index already yields a column named 'object_id' (the
# groupby key), so the rename below is a harmless no-op.
lines_gdf = lines_gdf.reset_index().rename(columns={'index': 'object_id'})
# Drop single-point objects, for which create_linestring returned None.
lines_gdf = lines_gdf[lines_gdf['geometry'].notnull()]

Calculate Statistics for each line#

Calculating different statistics for each line such as: overall time of movement, average heading, velocity, width, height and length

# Calculate the time difference (max - min timestamp) for each object_id,
# i.e. the total observed duration in milliseconds.
time_diff = points_gdf.groupby('object_id')['timestamp'].apply(lambda x: (x.max() - x.min()))

# Per-object typical heading, velocity, width, length and height.
# Note these are medians (robust to outliers) despite the "avg_" names.
avg_heading = points_gdf.groupby('object_id')['heading'].median()
avg_v = points_gdf.groupby('object_id')['v'].median()
avg_width = points_gdf.groupby('object_id')['width'].median()
avg_length = points_gdf.groupby('object_id')['length'].median()
avg_height = points_gdf.groupby('object_id')['height'].median()

# Combine the calculated statistics into a new DataFrame
# (all series share the object_id index, so columns align automatically).
statistics_df = pd.DataFrame({'total_time': time_diff, 'avg_heading': avg_heading, 'avg_v': avg_v, 'avg_width': avg_width, 'avg_length': avg_length, 'avg_height': avg_height})

Adding important Object characteristics from an initial DataFrame

# Add object_type column to the new DataFrame (type of the object's first point).
# NOTE(review): using .first().values relies on the groupby result being in
# the same (sorted object_id) order as statistics_df's index — true here
# because statistics_df came from the same groupby, but fragile if reordered.
statistics_df['object_type'] = points_gdf.groupby('object_id')['object_type'].first().values

# Add time_class column to the new DataFrame (interval of the first observation).
statistics_df['time_class'] = points_gdf.groupby('object_id')['time_class'].first().values

# Add average heading change column to the new DataFrame
# (per-object constant, so taking the first row's value suffices).
statistics_df['avg_heading_change'] = points_gdf.groupby('object_id')['avg_heading_change'].first().values

# Add average velocity change column to the new DataFrame
statistics_df['avg_v_change'] = points_gdf.groupby('object_id')['avg_v_change'].first().values

# Add distance between first and last points to the new DataFrame
statistics_df['distance_first_last'] = points_gdf.groupby('object_id')['distance_first_last'].first().values

Calculate whether the type of the same object has changed

# Add a column with all unique object types for each object_id
# (comma-joined in order of first appearance; aligned on the object_id index).
unique_object_types = points_gdf.groupby('object_id')['object_type'].apply(lambda x: ', '.join(x.unique()))
statistics_df['unique_object_types'] = unique_object_types

# Add columns for each object_type and its occurrences
# (wide table: one count column per type, 0 where a type never occurred).
object_type_counts = points_gdf.groupby(['object_id', 'object_type']).size().unstack(fill_value=0)
statistics_df = statistics_df.join(object_type_counts, on='object_id')

# Calculate the percentage of each object type for each object_id
object_type_percentages = object_type_counts.div(object_type_counts.sum(axis=1), axis=0) * 100

# Determine the object type with the highest percentage for each object_id;
# this overwrites the provisional first-point object_type assigned earlier.
top_object_type = object_type_percentages.idxmax(axis=1)
statistics_df['object_type'] = top_object_type

# Calculate the percentage of the top object_type for each object_id
# (100.0 means the object never changed type).
top_object_type_percentage = object_type_percentages.max(axis=1)
statistics_df['top_object_type_percentage'] = top_object_type_percentage

# Reset index to have object_id as a column instead of index
statistics_df.reset_index(inplace=True)

Merging Statistics to GeoDataFrames with lines for each object

# Attach the per-object statistics to each line geometry.
lines_gdf_stat = lines_gdf.merge(statistics_df, on='object_id', how='left')

Calculating length for each line

# Calculate length of each line in metres (after projecting to the local UTM zone)
lines_gdf_stat = lines_gdf_stat.to_crs(lines_gdf.estimate_utm_crs())
lines_gdf_stat['route_length'] = lines_gdf_stat['geometry'].length

# Total observation time in seconds (timestamps are in milliseconds).
lines_gdf_stat['total_time_s'] = lines_gdf_stat['total_time']/1000

# Ratio of travelled path length to straight-line displacement (>= 1 for
# plausible tracks; very large values suggest jitter around a fixed spot).
# NOTE(review): the name reads "direct/real" but the code computes
# real/direct — confirm which orientation downstream analysis expects.
lines_gdf_stat['direct_real_d_ratio'] = lines_gdf_stat['route_length']/lines_gdf_stat['distance_first_last']

# Back to WGS84 for mapping.
lines_gdf_stat = lines_gdf_stat.to_crs(epsg=4326)

lines_gdf_stat.head()
object_id geometry total_time avg_heading avg_v avg_width avg_length avg_height object_type time_class ... avg_v_change distance_first_last unique_object_types CAR CYCLIST PEDESTRIAN top_object_type_percentage route_length total_time_s direct_real_d_ratio
0 152997118 LINESTRING (13.06440 47.81014, 13.06441 47.810... 286302 133.7800 0.03 1.869 4.3870 1.151 CAR 0 ... 0.000004 0.165151 CAR 2748 0 0 100.0 102.985753 286.302 623.585910
1 152997181 LINESTRING (13.06399 47.81006, 13.06399 47.810... 138700 148.0520 0.77 0.559 0.5585 1.332 PEDESTRIAN 0 ... -0.000261 65.637374 PEDESTRIAN 0 0 1342 100.0 110.534391 138.700 1.684016
2 152997182 LINESTRING (13.06413 47.81005, 13.06413 47.810... 270701 139.9335 0.02 1.912 4.5230 1.690 CAR 0 ... 0.000004 0.086359 CAR 2616 0 0 100.0 60.296137 270.701 698.199732
3 152997183 LINESTRING (13.06339 47.80977, 13.06339 47.809... 101799 105.4660 0.01 1.872 4.8670 1.566 CAR 0 ... 0.009214 119.628432 CAR 980 0 0 100.0 144.550702 101.799 1.208331
4 152997184 LINESTRING (13.06339 47.80978, 13.06340 47.809... 100897 97.4585 0.01 1.974 5.0000 1.615 CAR 0 ... 0.006307 111.192417 CAR 972 0 0 100.0 144.484505 100.897 1.299410

5 rows × 21 columns

In this section, we processed the data, created geodataframe from the CSV file, and calculated summary parameters for further analysis.

General Descriptive Statistics#

This section is about exploring object types, their changes and summary statistics

0. Object Types#

Let’s look at the distribution of objects based on type

# Function to standardize each string by sorting the items
def standardize_string(s):
    """Return the comma-separated items of `s` in sorted order, so that
    e.g. 'PEDESTRIAN, CAR' and 'CAR, PEDESTRIAN' compare equal."""
    return ', '.join(sorted(s.split(', ')))

# Apply the function to the 'unique_object_types' column so that labels are
# order-insensitive ('CAR, PEDESTRIAN' and 'PEDESTRIAN, CAR' collapse to one).
lines_gdf_stat['unique_object_types_sorted'] = lines_gdf_stat['unique_object_types'].apply(standardize_string)

# Calculate counts and sort values by frequency in descending order
counts = lines_gdf_stat['unique_object_types_sorted'].value_counts().sort_values(ascending=False)

# Create a palette of different colors
palette = sns.color_palette("husl", len(counts))

# Plot the histogram with sorted values and different colors for each bin
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=counts.index, y=counts.values, palette=palette)

# Calculate the total number of observations
total = len(lines_gdf_stat)

# Annotate each bar with the count and percentage
for p, count in zip(ax.patches, counts):
    height = p.get_height()
    percentage = f'{height / total * 100:.2f}%'
    ax.annotate(f'{int(height)}\n({percentage})', 
                (p.get_x() + p.get_width() / 2., height), 
                ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=10)

# Remove the plot frame (spines)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Set title and labels
plt.title('Distribution of Object Types')
plt.xlabel('Object Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
_images/78dcce705ce117d13f1290428057c89d70e5cde6c23449a5e4c3edba2b39c4ef.png

This graph is different from the one in the first chapter as it looks at object types not for individual points but for the objects as a whole, also considering changes in object type.

We can see that more than 60% of the objects are cars, around 22.4% are pedestrians, and only 1% are two-wheelers. The remaining 17% are objects that changed their type along the route for some reason. We will explore these categories in the following section.

Exploring Data with TypeChanges#

We are exploring the objects that were changing their type along the route and dividing them into 3 classes based on the share of the main type: less than 80%, between 80% and 90%, and between 90% and 100%

# Create the 'typeChanges' field: 1 if the object ever changed type
# (its dominant type covers < 100% of points), else 0.
lines_gdf_stat['typeChanges'] = lines_gdf_stat['top_object_type_percentage'].apply(lambda x: 1 if x < 100 else 0)

# Create the 'typeChangesRate' field
def calculate_type_changes_rate(percentage):
    """Map the dominant-type share to a discrete instability rate:
    0 for a perfectly stable type, 0.1 / 0.2 for minor mixing
    (90-100% / 80-90% dominant), and 1 for heavy mixing (< 80%)."""
    if percentage == 100:
        return 0
    if 90 <= percentage < 100:
        return 0.1
    if 80 <= percentage < 90:
        return 0.2
    return 1

# Map the dominant-type share onto the discrete rate defined above.
lines_gdf_stat['typeChangesRate'] = lines_gdf_stat['top_object_type_percentage'].apply(calculate_type_changes_rate)

# Display the DataFrame with the new fields
lines_gdf_stat.head()
object_id geometry total_time avg_heading avg_v avg_width avg_length avg_height object_type time_class ... CAR CYCLIST PEDESTRIAN top_object_type_percentage route_length total_time_s direct_real_d_ratio unique_object_types_sorted typeChanges typeChangesRate
0 152997118 LINESTRING (13.06440 47.81014, 13.06441 47.810... 286302 133.7800 0.03 1.869 4.3870 1.151 CAR 0 ... 2748 0 0 100.0 102.985753 286.302 623.585910 CAR 0 0.0
1 152997181 LINESTRING (13.06399 47.81006, 13.06399 47.810... 138700 148.0520 0.77 0.559 0.5585 1.332 PEDESTRIAN 0 ... 0 0 1342 100.0 110.534391 138.700 1.684016 PEDESTRIAN 0 0.0
2 152997182 LINESTRING (13.06413 47.81005, 13.06413 47.810... 270701 139.9335 0.02 1.912 4.5230 1.690 CAR 0 ... 2616 0 0 100.0 60.296137 270.701 698.199732 CAR 0 0.0
3 152997183 LINESTRING (13.06339 47.80977, 13.06339 47.809... 101799 105.4660 0.01 1.872 4.8670 1.566 CAR 0 ... 980 0 0 100.0 144.550702 101.799 1.208331 CAR 0 0.0
4 152997184 LINESTRING (13.06339 47.80978, 13.06340 47.809... 100897 97.4585 0.01 1.974 5.0000 1.615 CAR 0 ... 972 0 0 100.0 144.484505 100.897 1.299410 CAR 0 0.0

5 rows × 24 columns

Filtering Data that has typeChanges along the route

# Keep only objects whose type changed at least once along the route.
lines_type_changes = lines_gdf_stat[lines_gdf_stat['typeChanges']>0]

Creating histogram to explore the distribution of objects based on the share of the main object

# Plot the histogram with a custom color: distribution of the dominant
# type's share among objects whose type changed (values are < 100 here).
plt.figure(figsize=(10, 6))
ax = sns.histplot(lines_type_changes['top_object_type_percentage'], kde=False, color='skyblue')

# Calculate the total number of observations
total = len(lines_type_changes)

# Annotate each bar with the count and percentage
for p in ax.patches:
    height = p.get_height()
    percentage = f'{height / total * 100:.2f}%'
    ax.annotate(f'{int(height)}\n({percentage})', 
                (p.get_x() + p.get_width() / 2., height), 
                ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=10)

# Remove the plot frame (spines)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Set title and labels
plt.title('Distribution of Top Object Type Percentage')
plt.xlabel('Top Object Type Percentage')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
_images/112566e0acb6a7486985c630f1d3e6c184dd050c55de0ff7df520f57b5c36448.png

For the same task creating Stacked Bar Chart, to see it from different perspective

# Assume lines_gdf_stat is your DataFrame and it has the column 'top_object_type_percentage'

# Create bins with 5% width
bins = np.arange(0, 105, 5)  # Create bins from 0 to 100 with step 5
labels = [f'{i}-{i+5}%' for i in range(0, 100, 5)]

# Bin the data.
# NOTE(review): with right=False each interval is [lo, hi), so objects at
# exactly 100% fall outside every bin and are silently dropped from the
# chart — confirm this exclusion of never-changing objects is intended.
lines_gdf_stat['bins'] = pd.cut(lines_gdf_stat['top_object_type_percentage'], bins=bins, labels=labels, right=False)

# Count the number of objects in each bin
bin_counts = lines_gdf_stat['bins'].value_counts().sort_index()

# Sort the bin counts in descending order for the legend
sorted_bin_counts = bin_counts.sort_values(ascending=False)

# Calculate the total number of objects
total = bin_counts.sum()

# Prepare the data for the stacked bar plot
bin_labels = sorted_bin_counts.index
counts = sorted_bin_counts.values
percentages = (counts / total) * 100

# Create an inverted gradient color palette
cmap = sns.color_palette("PuBuGn", len(counts))[::-1]

# Plot the stacked bar chart (background bar outlines the full stack).
plt.figure(figsize=(10, 6))
plt.bar(['All Bins'], [sum(counts)], color='white', edgecolor='black', linewidth=1)

# Add stacked sections with gradient colors
bottom = 0
for count, percentage, label, color in zip(counts, percentages, bin_labels, cmap):
    plt.bar(['All Bins'], [count], bottom=bottom, label=f'{label} ({percentage:.2f}%)', color=color)
    bottom += count

# Annotate each section with white text
bottom = 0
for count, percentage in zip(counts, percentages):
    plt.text(0, bottom + count / 2, f'{count}\n({percentage:.2f}%)', ha='center', va='center', fontsize=10, color='white')
    bottom += count


# Remove the plot frame (spines).
# Fixed: the original referenced a stale `ax` left over from a previous
# cell; this cell plots via plt.bar, so operate on the current axes.
ax = plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Set title and labels
plt.title('Stacked Bar Chart of Objects in Each 5% Bin')
plt.xlabel('Top Object Type Percentage Bins')
plt.ylabel('Number of Objects')
plt.legend(title='Bins', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
_images/e1cf341fff69fef51db507c4026be3c88fb21a23688ff5b6fc0a6cfb3db4acb4.png

More than 36% of the objects that change their type actually don’t change it much — 90%+ of their points are considered as one type. A further 12% have more than 90% of points with the same type.

This data helps us identify objects that change their type a lot and will be hard to analyse. We won’t filter the initial data based on this variable, but we will consider it as one of the parameters for data quality assessment

Let’s explore how the object types were changing (order of transition between types):

# Calculate counts and sort values by frequency in descending order
counts = lines_type_changes['unique_object_types'].value_counts().sort_values(ascending=False)

# Create a palette of different colors
palette = sns.color_palette("husl", len(counts))

# Plot the histogram with sorted values and different colors for each bin
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=counts.index, y=counts.values, palette=palette)

# Calculate the total number of observations
total = len(lines_type_changes)

# Annotate each bar with the count and percentage
for p, count in zip(ax.patches, counts):
    height = p.get_height()
    percentage = f'{height / total * 100:.2f}%'
    ax.annotate(f'{int(height)}\n({percentage})', 
                (p.get_x() + p.get_width() / 2., height), 
                ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=10)

# Remove the plot frame (spines)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Set title and labels
plt.title('Distribution of Object Types')
plt.xlabel('Object Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
_images/444e03c8e3be9d7c0bf3e1093a5b98c5d0db028637348669da9a45ff8ee19f6c.png

Most of the objects changed their type from pedestrian to car or from pedestrian to cyclist (around 28% for each)

We tried to explore whether there would be some interesting pattern in heading change or velocity change for each of these groups, but based on the average parameters we couldn’t find anything

 # Calculate average heading change per object
avg_heading_change = lines_type_changes.groupby('unique_object_types')['avg_heading_change'].mean()

# Average velocity change per type-transition group (the original comment
# here was copy-pasted from the heading computation above).
avg_v_change = lines_type_changes.groupby('unique_object_types')['avg_v_change'].mean()


print(avg_heading_change)
print(avg_v_change)
unique_object_types
CAR, CYCLIST               -0.121379
CAR, CYCLIST, PEDESTRIAN   -0.289290
CAR, PEDESTRIAN             1.940362
CAR, PEDESTRIAN, CYCLIST   -0.374118
CYCLIST, CAR                0.981525
CYCLIST, CAR, PEDESTRIAN    0.251706
CYCLIST, PEDESTRIAN        -0.368309
CYCLIST, PEDESTRIAN, CAR   -0.630435
PEDESTRIAN, CAR            -0.088528
PEDESTRIAN, CAR, CYCLIST   -0.495457
PEDESTRIAN, CYCLIST         0.125764
PEDESTRIAN, CYCLIST, CAR   -0.379689
Name: avg_heading_change, dtype: float64
unique_object_types
CAR, CYCLIST                0.010916
CAR, CYCLIST, PEDESTRIAN   -0.010641
CAR, PEDESTRIAN            -0.018787
CAR, PEDESTRIAN, CYCLIST    0.003939
CYCLIST, CAR               -0.010123
CYCLIST, CAR, PEDESTRIAN   -0.003026
CYCLIST, PEDESTRIAN         0.002024
CYCLIST, PEDESTRIAN, CAR   -0.033420
PEDESTRIAN, CAR             0.040450
PEDESTRIAN, CAR, CYCLIST    0.024540
PEDESTRIAN, CYCLIST         0.006920
PEDESTRIAN, CYCLIST, CAR    0.005258
Name: avg_v_change, dtype: float64

Additional Functions#

We have created some additional functions that will help us explore the data for different object types:

  1. The function to create plots for different fields in a dataframe:

def plot_stats(df, x_field, y_field=None, z_field=None):
    """Show a boxplot plus histograms for up to three numeric columns and print their stats.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    x_field : str
        Required column; always plotted.
    y_field, z_field : str, optional
        Extra columns. All provided columns go into the boxplot; the first
        extra one provided is shown as the second histogram.
    """
    # Columns actually provided, preserving the x/y/z order.
    fields = [f for f in (x_field, y_field, z_field) if f]

    # Create figure and axes for the boxplot and the two histograms
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    # Boxplot of every provided column. (The original if/elif ladder dropped
    # z_field from the boxplot whenever y_field was not given; building the
    # column list up-front fixes that and removes the duplication.)
    sns.boxplot(data=df[fields], ax=axes[0], palette='Set3')
    axes[0].set_title('Boxplot')
    axes[0].set_xlabel('Variables')
    axes[0].set_ylabel('Values')

    # Histogram for x_field
    sns.histplot(data=df, x=x_field, bins=10, kde=True, color='skyblue', ax=axes[1])
    axes[1].set_title(f'Histogram of {x_field}')
    axes[1].set_xlabel(f'{x_field}')
    axes[1].set_ylabel('Frequency')

    # Histogram for the first extra field (y_field takes priority, as before)
    extra = y_field or z_field
    if extra:
        sns.histplot(data=df, x=extra, bins=10, kde=True, color='lightgreen', ax=axes[2])
        axes[2].set_title(f'Histogram of {extra}')
        axes[2].set_xlabel(f'{extra}')
        axes[2].set_ylabel('Frequency')
    else:
        axes[2].axis('off')  # Hide the third subplot when only x_field is given

    # Adjust layout
    plt.tight_layout()

    # Show plots
    plt.show()

    # Print describe() statistics for every provided field
    for field in fields:
        print(f"\nStatistics for {field}:")
        print(df[field].describe())
  1. The function to create a set of plots for specific fields in a dataframe:

def analyze_dataset(dataset):
    """Run the standard set of plot_stats views over one object-type subset."""
    # Each tuple is one plot_stats call: (x_field, y_field, z_field).
    views = [
        # 1: object dimension characteristics (width, length, height)
        ('avg_width', 'avg_length', 'avg_height'),
        # 2: route length and distance between first and last point
        ('route_length', 'distance_first_last', None),
        # 3: ratio between route length and first-to-last distance
        ('direct_real_d_ratio', None, None),
        # 4: total time
        ('total_time', None, None),
        # 5: average velocity
        ('avg_v', None, None),
    ]
    for x_field, y_field, z_field in views:
        plot_stats(dataset, x_field, y_field, z_field)

1. Cars#

lines_gdf_cars = lines_gdf_stat[lines_gdf_stat['object_type'] =='CAR']
lines_gdf_cars.head()
object_id geometry total_time avg_heading avg_v avg_width avg_length avg_height object_type time_class ... CYCLIST PEDESTRIAN top_object_type_percentage route_length total_time_s direct_real_d_ratio unique_object_types_sorted typeChanges typeChangesRate bins
0 152997118 LINESTRING (13.06440 47.81014, 13.06441 47.810... 286302 133.7800 0.03 1.869 4.387 1.151 CAR 0 ... 0 0 100.0 102.985753 286.302 623.585910 CAR 0 0.0 NaN
2 152997182 LINESTRING (13.06413 47.81005, 13.06413 47.810... 270701 139.9335 0.02 1.912 4.523 1.690 CAR 0 ... 0 0 100.0 60.296137 270.701 698.199732 CAR 0 0.0 NaN
3 152997183 LINESTRING (13.06339 47.80977, 13.06339 47.809... 101799 105.4660 0.01 1.872 4.867 1.566 CAR 0 ... 0 0 100.0 144.550702 101.799 1.208331 CAR 0 0.0 NaN
4 152997184 LINESTRING (13.06339 47.80978, 13.06340 47.809... 100897 97.4585 0.01 1.974 5.000 1.615 CAR 0 ... 0 0 100.0 144.484505 100.897 1.299410 CAR 0 0.0 NaN
7 152997753 LINESTRING (13.06394 47.80935, 13.06394 47.809... 109398 34.2955 0.04 2.227 5.777 2.422 CAR 0 ... 0 0 100.0 153.183233 109.398 2.013326 CAR 0 0.0 NaN

5 rows × 25 columns

CARS DIMENSIONS (width, height, length)#

analyze_dataset(lines_gdf_cars)
_images/a09631924d0ed6e2f723e0a022312150e845a62ae3170a2fe087b7e590eef3ed.png
Statistics for avg_width:
count    4178.000000
mean        1.899717
std         0.577520
min         0.054000
25%         1.854250
50%         1.923500
75%         1.989000
max        18.495000
Name: avg_width, dtype: float64

Statistics for avg_length:
count    4178.000000
mean        4.008679
std         2.248963
min         0.009000
25%         4.023000
50%         4.395000
75%         4.664000
max        18.706000
Name: avg_length, dtype: float64

Statistics for avg_height:
count    4178.000000
mean        1.136760
std         0.694165
min         0.004000
25%         0.636625
50%         1.145000
75%         1.425000
max         3.071000
Name: avg_height, dtype: float64
_images/dc621c59493759e3467fd7c011553eb5e42e0dccd150f8771c891e789afc75ee.png
Statistics for route_length:
count    4178.000000
mean       75.322052
std        61.633961
min         0.000000
25%        21.538840
50%        77.358236
75%       112.704000
max       784.263218
Name: route_length, dtype: float64

Statistics for distance_first_last:
count    4178.000000
mean       56.380821
std        43.919954
min         0.000000
25%         9.259288
50%        58.883808
75%        93.647267
max       149.418347
Name: distance_first_last, dtype: float64
_images/038ed84d825923ad467a2036c0fb5f3b5944e6283209114bdffc5f3f767653ca.png
Statistics for direct_real_d_ratio:
count    4176.000000
mean             inf
std              NaN
min         1.000000
25%         1.055816
50%         1.156346
75%         1.455117
max              inf
Name: direct_real_d_ratio, dtype: float64
_images/20191a0a27e46e3511e651e71e1eb58d8e6f8aac509e6d13e7f80c126f2f1863.png
Statistics for total_time:
count      4178.000000
mean      31888.159167
std       44217.103261
min          91.000000
25%        5098.500000
50%       14200.000000
75%       51098.000000
max      975498.000000
Name: total_time, dtype: float64
_images/c79d95e853723a79301b8dde9b3b4fbe066d658ea37a52027ebbf2ef957d0e1f.png
Statistics for avg_v:
count    4178.000000
mean        3.500172
std         3.425574
min         0.000000
25%         0.120000
50%         2.662500
75%         6.160000
max        14.745000
Name: avg_v, dtype: float64

Based on these characteristics we can identify statistical outliers and use them as a data quality metric.

def detect_outliers(df, columns):
    """Flag IQR outliers in the given columns and count them per row.

    For each column a ``<col>_outlier`` field is added: 1 when the value
    falls strictly outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], else 0. A final
    ``total_outliers`` field holds the per-row sum of all flags.
    Returns a flagged copy; the input DataFrame is left untouched.
    """
    flagged = df.copy()

    for column in columns:
        # Quartiles and the Tukey fences for this column
        q1 = flagged[column].quantile(0.25)
        q3 = flagged[column].quantile(0.75)
        spread = q3 - q1
        low = q1 - 1.5 * spread
        high = q3 + 1.5 * spread

        # Strict comparisons: values exactly on a fence are not outliers
        is_outlier = (flagged[column] < low) | (flagged[column] > high)
        flagged[f"{column}_outlier"] = is_outlier.astype(int)

    # Per-row total across all flag columns
    flag_fields = [f"{column}_outlier" for column in columns]
    flagged['total_outliers'] = flagged[flag_fields].sum(axis=1)

    return flagged
columns_to_check = ['total_time', 'avg_v', 'avg_width', 'avg_height', 'avg_length', 'route_length', 'distance_first_last']
cars_outliers = detect_outliers(lines_gdf_cars, columns_to_check)
cars_outliers.head()
object_id geometry total_time avg_heading avg_v avg_width avg_length avg_height object_type time_class ... direction_class avg_heading_rad total_time_outlier avg_v_outlier avg_width_outlier avg_height_outlier avg_length_outlier route_length_outlier distance_first_last_outlier total_outliers
0 152997118 LINESTRING (13.06440 47.81014, 13.06441 47.810... 286302 133.7800 0.03 1.869 4.387 1.151 CAR 0 ... 1 2.334901 1 0 0 0 0 0 0 1
1 152997182 LINESTRING (13.06413 47.81005, 13.06413 47.810... 270701 139.9335 0.02 1.912 4.523 1.690 CAR 0 ... 1 2.442300 1 0 0 0 0 0 0 1
2 152997183 LINESTRING (13.06339 47.80977, 13.06339 47.809... 101799 105.4660 0.01 1.872 4.867 1.566 CAR 0 ... 1 1.840729 0 0 0 0 0 0 0 0
3 152997184 LINESTRING (13.06339 47.80978, 13.06340 47.809... 100897 97.4585 0.01 1.974 5.000 1.615 CAR 0 ... 1 1.700972 0 0 0 0 0 0 0 0
4 152997753 LINESTRING (13.06394 47.80935, 13.06394 47.809... 109398 34.2955 0.04 2.227 5.777 2.422 CAR 0 ... 0 0.598569 0 0 1 0 1 0 0 2

5 rows × 37 columns

# Calculate the counts and percentages of each combination of time_class and total_outliers
counts = cars_outliers.groupby(['time_class', 'total_outliers']).size().reset_index(name='count')
total_counts = counts.groupby('time_class')['count'].transform('sum')
counts['percentage'] = (counts['count'] / total_counts) * 100

# Pivot for plotting: one row per time_class, one column per outlier count
pivot_counts = counts.pivot(index='time_class', columns='total_outliers', values='percentage').fillna(0)

# Plotting a stacked percentage bar chart
colors = ['skyblue', 'lightgreen', 'salmon', 'orange', 'lightcoral', 'gray']  # one color per total_outliers value
ax = pivot_counts.plot(kind='bar', stacked=True, color=colors, figsize=(8, 6))

# Add labels and title
ax.set_title('Object Counts by time_class and total_outliers', fontsize=14)
ax.set_xlabel('time_class', fontsize=12)
ax.set_ylabel('Percentage of Objects (%)', fontsize=12)

ax.legend(title='total_outliers', fontsize=10)

# Show percentages inside each bar segment; cum_sum places each label at
# the vertical middle of its own stacked segment
for i in range(pivot_counts.shape[0]):
    total = pivot_counts.iloc[i].sum()
    cum_sum = 0
    for j in range(pivot_counts.shape[1]):
        value = pivot_counts.iloc[i, j]
        ax.text(i, cum_sum + value / 2, f'{value:.1f}%', ha='center', va='center', color='gray', fontsize=7)
        cum_sum += value

plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
_images/c9ea72b2a96d5572ff7d17ab4ea2ab0243e64e896653342523ae01b13b2b8251.png

2. Pedestrians#

lines_gdf_pedestrians = lines_gdf_stat[lines_gdf_stat['object_type'] =='PEDESTRIAN']
analyze_dataset(lines_gdf_pedestrians)
_images/d44904200cec1bdcd0e6d12d03be1de509b3bdd66f882f79f07640d2d62e64cb.png
Statistics for avg_width:
count    1769.000000
mean        0.517953
std         0.210210
min         0.064000
25%         0.368000
50%         0.503500
75%         0.624000
max         1.491000
Name: avg_width, dtype: float64

Statistics for avg_length:
count    1769.000000
mean        0.499413
std         0.242799
min         0.092000
25%         0.315000
50%         0.470000
75%         0.595000
max         2.364000
Name: avg_length, dtype: float64

Statistics for avg_height:
count    1769.000000
mean        1.109840
std         0.360861
min         0.048500
25%         0.849000
50%         1.123500
75%         1.372500
max         2.450000
Name: avg_height, dtype: float64
_images/54e186f3bb86b28800d04e8cc7d7a859c34f9ab2f979036b253ca27c5ec66d2b.png
Statistics for route_length:
count    1769.000000
mean       24.235552
std        44.936549
min         0.000000
25%         1.203799
50%         6.902049
75%        28.358431
max       895.087643
Name: route_length, dtype: float64

Statistics for distance_first_last:
count    1769.000000
mean       14.300289
std        25.493666
min         0.000000
25%         0.245034
50%         2.382743
75%        14.300744
max       133.101487
Name: distance_first_last, dtype: float64
_images/5e5d34109c48ad504b71fe3f38edbbeb51c70f8acddc35d6f2827bde24bcc389.png
Statistics for direct_real_d_ratio:
count    1760.000000
mean             inf
std              NaN
min         1.000000
25%         1.092678
50%         1.405990
75%         4.543949
max              inf
Name: direct_real_d_ratio, dtype: float64
_images/97d152ed1ea2576014abae76f06cc495411b79af8b52099a92009a39d051a123.png
Statistics for total_time:
count      1769.000000
mean      29624.931034
std       56806.996871
min          93.000000
25%        2400.000000
50%        8596.000000
75%       33999.000000
max      769198.000000
Name: total_time, dtype: float64
_images/a73cfcd8b1bdf1336707ccd99c192b9b562d2dbc5ef74c3f347bb8269f3c3740.png
Statistics for avg_v:
count    1769.000000
mean        0.726439
std         0.721659
min         0.000000
25%         0.070000
50%         0.600000
75%         1.300000
max         6.880000
Name: avg_v, dtype: float64
columns_to_check = ['total_time', 'avg_v', 'avg_width', 'avg_height', 'avg_length', 'route_length', 'distance_first_last']
pedestrians_outliers = detect_outliers(lines_gdf_pedestrians, columns_to_check)
pedestrians_outliers.head()
object_id geometry total_time avg_heading avg_v avg_width avg_length avg_height object_type time_class ... direction_class avg_heading_rad total_time_outlier avg_v_outlier avg_width_outlier avg_height_outlier avg_length_outlier route_length_outlier distance_first_last_outlier total_outliers
1 152997181 LINESTRING (13.06399 47.81006, 13.06399 47.810... 138700 148.052 0.770 0.559 0.5585 1.3320 PEDESTRIAN 0 ... 1 2.583995 1 0 0 0 0 1 1 3
5 152997343 LINESTRING (13.06429 47.80957, 13.06429 47.809... 108398 228.052 0.100 0.659 0.6375 1.0935 PEDESTRIAN 0 ... 2 3.980258 1 0 0 0 0 0 0 1
6 152997466 LINESTRING (13.06442 47.80944, 13.06441 47.809... 115007 233.052 0.070 0.459 0.5000 1.4510 PEDESTRIAN 0 ... 3 4.067525 1 0 0 0 0 1 1 3
8 152997836 LINESTRING (13.06453 47.80964, 13.06453 47.809... 109009 134.466 0.020 0.574 0.6820 1.2320 PEDESTRIAN 0 ... 1 2.346874 1 0 0 0 0 1 1 3
9 152997959 LINESTRING (13.06450 47.80986, 13.06451 47.809... 116001 223.052 0.585 0.448 0.3850 1.1940 PEDESTRIAN 0 ... 2 3.892992 1 0 0 0 0 1 0 2

5 rows × 35 columns

# Calculate the counts and percentages of each combination of time_class and total_outliers
counts = pedestrians_outliers.groupby(['time_class', 'total_outliers']).size().reset_index(name='count')
total_counts = counts.groupby('time_class')['count'].transform('sum')
counts['percentage'] = (counts['count'] / total_counts) * 100

# Pivot for plotting: one row per time_class, one column per outlier count
pivot_counts = counts.pivot(index='time_class', columns='total_outliers', values='percentage').fillna(0)

# Plotting a stacked percentage bar chart
colors = ['skyblue', 'lightgreen', 'salmon', 'orange', 'lightcoral']  # one color per total_outliers value
ax = pivot_counts.plot(kind='bar', stacked=True, color=colors, figsize=(8, 6))

# Add labels and title
ax.set_title('Object Counts by time_class and total_outliers', fontsize=14)
ax.set_xlabel('time_class', fontsize=12)
ax.set_ylabel('Percentage of Objects (%)', fontsize=12)

ax.legend(title='total_outliers', fontsize=10)

# Show percentages inside each bar segment; cum_sum places each label at
# the vertical middle of its own stacked segment
for i in range(pivot_counts.shape[0]):
    total = pivot_counts.iloc[i].sum()
    cum_sum = 0
    for j in range(pivot_counts.shape[1]):
        value = pivot_counts.iloc[i, j]
        ax.text(i, cum_sum + value / 2, f'{value:.1f}%', ha='center', va='center', color='gray', fontsize=7)
        cum_sum += value

plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
_images/13982e063d36edf4e0c85321a3b8bb9b74c8825e5b9a9220dce769af96bcb3b5.png

3. Two-Wheelers#

lines_gdf_two_wheelers = lines_gdf_stat[lines_gdf_stat['object_type'] =='CYCLIST']
analyze_dataset(lines_gdf_two_wheelers)
_images/cbc6b9eae98874dbd7f2e6267384bac91d746f201d6efbdb5fecaab6d49045d7.png
Statistics for avg_width:
count    250.000000
mean       0.632556
std        0.207776
min        0.112500
25%        0.541625
50%        0.623000
75%        0.697250
max        1.714000
Name: avg_width, dtype: float64

Statistics for avg_length:
count    250.000000
mean       1.252888
std        0.498308
min        0.076000
25%        0.898750
50%        1.250000
75%        1.629125
max        3.735000
Name: avg_length, dtype: float64

Statistics for avg_height:
count    250.000000
mean       1.285610
std        0.310616
min        0.019000
25%        1.155250
50%        1.344250
75%        1.445750
max        2.803000
Name: avg_height, dtype: float64
_images/a256fb9425f019eb5b6ab9ee009982797f7d91872b6f2b6ce4f7449c4e17ce9a.png
Statistics for route_length:
count    250.000000
mean      76.275553
std       58.753900
min        0.022234
25%       22.881110
50%       84.569250
75%      113.700210
max      559.920694
Name: route_length, dtype: float64

Statistics for distance_first_last:
count    250.000000
mean      63.831372
std       43.892981
min        0.000000
25%       18.486127
50%       75.049915
75%       99.808464
max      135.982695
Name: distance_first_last, dtype: float64
_images/8a3297e87d5b3f916d3c6887513525838478718b856b69b2578702c78c8f9fed.png
Statistics for direct_real_d_ratio:
count    250.000000
mean            inf
std             NaN
min        1.000000
25%        1.026796
50%        1.087646
75%        1.246673
max             inf
Name: direct_real_d_ratio, dtype: float64
_images/906eca88f9723e24172f7f907828a93329cc1f14df830a97af5eeee177124fd9.png
Statistics for total_time:
count       250.000000
mean      31482.288000
std       33844.385272
min          94.000000
25%        8623.250000
50%       19600.000000
75%       41577.000000
max      216099.000000
Name: total_time, dtype: float64
_images/c90f961b28a24bfe0aa1a272444177ac33cc5f2219e58b40e203a670bea76687.png
Statistics for avg_v:
count    250.000000
mean       3.245340
std        2.823628
min        0.000000
25%        1.022500
50%        2.695000
75%        5.335000
max       13.430000
Name: avg_v, dtype: float64
# Flag IQR outliers for the two-wheelers subset on the same feature set used for cars/pedestrians
columns_to_check = ['total_time', 'avg_v', 'avg_width', 'avg_height', 'avg_length', 'route_length', 'distance_first_last']
two_wheelers_outliers = detect_outliers(lines_gdf_two_wheelers, columns_to_check)
# Calculate the counts and percentages of each combination of time_class and total_outliers
counts = two_wheelers_outliers.groupby(['time_class', 'total_outliers']).size().reset_index(name='count')
total_counts = counts.groupby('time_class')['count'].transform('sum')
counts['percentage'] = (counts['count'] / total_counts) * 100

# Pivot for plotting: one row per time_class, one column per outlier count
pivot_counts = counts.pivot(index='time_class', columns='total_outliers', values='percentage').fillna(0)

# Plotting a stacked percentage bar chart
colors = ['skyblue', 'lightgreen', 'salmon', 'orange', 'lightcoral']  # one color per total_outliers value
ax = pivot_counts.plot(kind='bar', stacked=True, color=colors, figsize=(8, 6))

# Add labels and title
ax.set_title('Object Counts by time_class and total_outliers', fontsize=14)
ax.set_xlabel('time_class', fontsize=12)
ax.set_ylabel('Percentage of Objects (%)', fontsize=12)

ax.legend(title='total_outliers', fontsize=10)

# Show percentages inside each bar segment; cum_sum places each label at
# the vertical middle of its own stacked segment
for i in range(pivot_counts.shape[0]):
    total = pivot_counts.iloc[i].sum()
    cum_sum = 0
    for j in range(pivot_counts.shape[1]):
        value = pivot_counts.iloc[i, j]
        ax.text(i, cum_sum + value / 2, f'{value:.1f}%', ha='center', va='center', color='gray', fontsize=7)
        cum_sum += value

plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
_images/5d57215e07091d25d57c06df69171ace67c09e691f8b80af96671a79f4a5d9f4.png

Quality Assessment#

Defining directions and calculating number of cars in each direction. Option 1#

Creating Starting points#

starting_points = points_gdf.groupby('object_id').first().reset_index().set_crs(epsg=4326)
starting_points.head()
object_id timestamp heading height width length v tracking_status object_type lon lat time_class geometry distance_first_last heading_change avg_heading_change v_change avg_v_change
0 152997118 1712062811083 132.072 0.735 1.942 4.387 0.03 TRACKING CAR 13.064405 47.810136 0 POINT (13.06440 47.81014) 0.165151 0.0 0.065886 0.0 0.000004
1 152997181 1712062820184 198.052 1.458 0.641 0.366 1.10 TRACKING PEDESTRIAN 13.063991 47.810058 0 POINT (13.06399 47.81006) 65.637374 0.0 -0.048435 0.0 -0.000261
2 152997182 1712062820083 319.543 1.690 1.916 4.555 0.02 TRACKING CAR 13.064130 47.810046 0 POINT (13.06413 47.81005) 0.086359 0.0 -0.068684 0.0 0.000004
3 152997183 1712062822387 103.052 0.539 1.923 4.550 1.92 TRACKING CAR 13.063387 47.809774 0 POINT (13.06339 47.80977) 119.628432 0.0 -0.081633 0.0 0.009214
4 152997184 1712062824786 85.395 1.207 1.981 4.367 2.18 TRACKING CAR 13.063389 47.809776 0 POINT (13.06339 47.80978) 111.192417 0.0 -0.053851 0.0 0.006307
starting_points.explore(tiles='Esri.WorldImagery')
Make this Notebook Trusted to load map: File -> Trust Notebook
ending_points = points_gdf.groupby('object_id').last().reset_index().set_crs(epsg=4326)
ending_points.head()
object_id timestamp heading height width length v tracking_status object_type lon lat time_class geometry distance_first_last heading_change avg_heading_change v_change avg_v_change
0 152997118 1712063097385 313.127 0.348 1.955 4.500 0.04 TRACKING CAR 13.064403 47.810137 0 POINT (13.06440 47.81014) 0.165151 -0.696 0.065886 0.03 0.000004
1 152997181 1712062958884 133.052 0.528 0.645 0.209 0.75 TRACKING PEDESTRIAN 13.064302 47.809506 0 POINT (13.06430 47.80951) 65.637374 75.000 -0.048435 0.05 -0.000261
2 152997182 1712063090784 139.865 0.119 1.883 4.461 0.03 TRACKING CAR 13.064130 47.810045 0 POINT (13.06413 47.81005) 0.086359 0.126 -0.068684 0.00 0.000004
3 152997183 1712062924186 23.052 0.017 1.880 4.287 10.95 TRACKING CAR 13.064855 47.810200 0 POINT (13.06485 47.81020) 119.628432 -15.000 -0.081633 -0.40 0.009214
4 152997184 1712062925683 33.052 0.596 1.959 4.330 8.31 TRACKING CAR 13.064773 47.810138 0 POINT (13.06477 47.81014) 111.192417 -9.464 -0.053851 -2.54 0.006307
ending_points.explore(tiles='Esri.WorldImagery')
Make this Notebook Trusted to load map: File -> Trust Notebook

CARS Directions#

def create_regular_grid(gdf, square_size):
    """Build a regular square grid (fishnet) covering the bounds of *gdf*.

    The grid is generated in the dataset's estimated UTM zone so that
    *square_size* is in metres, then returned in EPSG:4326 with a
    sequential ``grid_id`` column.
    """

    #calculating UTM zone for the data
    utm_zone = gdf.estimate_utm_crs()
    # reproject the dataset so square_size is interpreted in metres
    gdf = gdf.to_crs(utm_zone)
    minX, minY, maxX, maxY = gdf.total_bounds
    
    grid_cells = []
    x, y = minX, minY

    # Sweep from the bottom-left corner; the <= bound keeps a final
    # row/column that starts at the max coordinate, so the grid fully
    # covers the data extent.
    while y <= maxY:
        while x <= maxX:
            geom = Polygon([(x, y), (x, y + square_size), (x + square_size, y + square_size), (x + square_size, y), (x, y)])
            grid_cells.append(geom)
            x += square_size
        x = minX
        y += square_size

    fishnet = gpd.GeoDataFrame(geometry=grid_cells, crs=utm_zone)
    fishnet['grid_id'] = range(len(grid_cells))
    fishnet = fishnet.to_crs(epsg=4326)
    return fishnet
startingPointsCars = starting_points[starting_points['object_type']=="CAR"]
endingPointsCars = ending_points[ending_points['object_type']=="CAR"]

startEndPointsCars = pd.concat([startingPointsCars, endingPointsCars], ignore_index=True)
grid = create_regular_grid(startEndPointsCars, 7.5)
grid.explore(tiles='Esri.WorldImagery')
Make this Notebook Trusted to load map: File -> Trust Notebook
points_grid = gpd.sjoin(startEndPointsCars, grid, predicate='within')
points_count = points_grid.groupby('grid_id').size().reset_index(name='points_count')
grid_with_count = grid.merge(points_count, on='grid_id', how='left')

grid_with_count.head()
geometry grid_id points_count
0 POLYGON ((13.06309 47.80930, 13.06309 47.80937... 0 NaN
1 POLYGON ((13.06319 47.80930, 13.06319 47.80937... 1 NaN
2 POLYGON ((13.06329 47.80930, 13.06329 47.80937... 2 NaN
3 POLYGON ((13.06339 47.80930, 13.06339 47.80937... 3 NaN
4 POLYGON ((13.06349 47.80931, 13.06349 47.80937... 4 NaN
grid_main_clusters = grid_with_count[grid_with_count['points_count']>25]
grid_with_count.explore(column='points_count', tiles='Esri.WorldImagery')
Make this Notebook Trusted to load map: File -> Trust Notebook

Create Clusters#

Function to find clusters of polygons based on spatial relationships (touches or intersects):

def find_polygon_clusters(gdf):
    """Group polygons into clusters of spatially connected geometries.

    Two polygons belong to the same cluster when they touch or intersect,
    directly or through a chain of neighbours (transitive closure). The
    original pairwise labelling was not transitive: a neighbour that had
    already received a different cluster name was never merged, so one
    connected component could end up split across several cluster names.
    A union-find structure over row positions fixes that. It also skips
    the self-match (every polygon intersects itself) and drops the unused
    ``polygon_groups`` accumulator.

    Returns a copy of *gdf* (invalid geometries removed) with a ``cluster``
    column holding names 'Cluster1', 'Cluster2', ... in order of first
    appearance.
    """
    # Work on a copy of the valid geometries so the caller's frame is
    # untouched (also avoids SettingWithCopy on the filtered slice).
    gdf = gdf[gdf.is_valid].copy()
    n = len(gdf)

    # --- union-find over row positions ---
    parent = list(range(n))

    def _find(i):
        # path-halving find
        while parent[i] != i:
            parent[i] = parent[parent[i]]
            i = parent[i]
        return i

    def _union(a, b):
        root_a, root_b = _find(a), _find(b)
        if root_a != root_b:
            parent[root_b] = root_a

    # Union every polygon with all polygons it touches or intersects
    geometries = gdf.geometry.reset_index(drop=True)
    for i in range(n):
        geom = geometries.iloc[i]
        matches = geometries.touches(geom) | geometries.intersects(geom)
        for j in np.flatnonzero(matches.values):
            if j != i:  # skip the self-match
                _union(i, int(j))

    # Name connected components in first-appearance order
    names = {}
    labels = []
    counter = 1
    for i in range(n):
        root = _find(i)
        if root not in names:
            names[root] = f'Cluster{counter}'
            counter += 1
        labels.append(names[root])

    gdf['cluster'] = labels
    return gdf
cars_clusters = find_polygon_clusters(grid_main_clusters)
cars_clusters .explore(column='cluster', tiles='Esri.WorldImagery')
Make this Notebook Trusted to load map: File -> Trust Notebook
# Attach a cluster label to each car's start and end point via a point-in-polygon join
startingPointsCars_CLUSTER = gpd.sjoin(startingPointsCars.to_crs(cars_clusters.crs), cars_clusters, predicate='within')
endingPointsCars_CLUSTER = gpd.sjoin(endingPointsCars.to_crs(cars_clusters.crs), cars_clusters, predicate='within')

# Keep the cluster name under direction-specific column names for the later merges
startingPointsCars_CLUSTER['start_cluster'] = startingPointsCars_CLUSTER['cluster']
endingPointsCars_CLUSTER['end_cluster'] = endingPointsCars_CLUSTER['cluster']
startingPointsCars_CLUSTER.explore(column="cluster", tiles='Esri.WorldImagery')
Make this Notebook Trusted to load map: File -> Trust Notebook
endingPointsCars_CLUSTER.explore(column="cluster", tiles='Esri.WorldImagery')
Make this Notebook Trusted to load map: File -> Trust Notebook

Getting Directions

lines_gdf_cars = lines_gdf_cars.merge(startingPointsCars_CLUSTER[['object_id','start_cluster']], on="object_id", how="left")
lines_gdf_cars = lines_gdf_cars.merge(endingPointsCars_CLUSTER[['object_id','end_cluster']], on="object_id", how="left")
lines_gdf_cars.groupby(['start_cluster', 'end_cluster']).size()
start_cluster  end_cluster
Cluster1       Cluster1        36
               Cluster2         2
               Cluster3        14
               Cluster4       140
Cluster2       Cluster1       118
               Cluster2        66
               Cluster3       383
               Cluster4       114
Cluster3       Cluster1        31
               Cluster2       490
               Cluster3       298
               Cluster4       260
Cluster4       Cluster1       118
               Cluster2       249
               Cluster3       248
               Cluster4       922
Cluster5       Cluster5        18
dtype: int64

Defining directions and calculating number of cars in each direction. Option 2#

The previous approach unfortunately cannot be applied to pedestrians and two-wheelers, as their routes are less clearly distinguishable and not straight. So, for pedestrians and two-wheelers we will define their directions based on the intersection with crosslines created by us in QGIS.

Defining directions and calculating number of cars in each direction. Option 3#

This approach is very easy to implement but it is less precise in comparison to the previous two. We have started with it and want to show it anyway as it was a part of our work and it still shows some overall information about directions

Calculating amount of objects in different directions for cars#

# Natural (Jenks) breaks over the cars' average heading, 5 classes
cars_heading_breaks = jenkspy.jenks_breaks(lines_gdf_cars['avg_heading'], 5)

# Create a direction-class column based on the Jenks breaks
lines_gdf_cars['direction_class'] = pd.cut(lines_gdf_cars['avg_heading'], bins=cars_heading_breaks, labels=False, include_lowest=True)

# Count the number of objects in each direction class
direction_class_counts = lines_gdf_cars['direction_class'].value_counts().sort_index()

print("Breaks:", cars_heading_breaks)
print("Direction Class Counts:\n", direction_class_counts)
Breaks: [8.052, 70.552, 158.05200000000002, 236.827, 276.276, 353.052]
Direction Class Counts:
 0     642
1    1072
2    1361
3     641
4     462
Name: direction_class, dtype: int64
# Create a histogram of avg_heading, coloring bins by direction class
sns.histplot(data=lines_gdf_cars, x='avg_heading', hue='direction_class', bins=20, alpha=0.8, palette='BuPu', edgecolor='lightgray')

# Removing graph border
sns.despine()

# Adding title and axis names
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram, bins colored by class')

# Showing graph
plt.show()
_images/6dcbfe9362ceda41dd9144d7de3dbe38872b09e62ada3fd47abeb71756553ef9.png
# Convert angles from degrees to radians for the polar plot
lines_gdf_cars['avg_heading_rad'] = np.deg2rad(lines_gdf_cars['avg_heading'])

# Create the polar plot
plt.figure(figsize=(10, 6))
ax = plt.subplot(111, projection='polar')

# Configure the polar plot: 0 degrees at the top and clockwise direction
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)

# Define the number of bins
num_bins = 8
bins = np.linspace(0, 2 * np.pi, num_bins + 1)

# Count the number of objects in each bin
hist, edges = np.histogram(lines_gdf_cars['avg_heading_rad'], bins=bins)

# Plot the histogram on the polar plot
bars = ax.bar(edges[:-1], hist, width=np.diff(edges), align='edge', edgecolor='black')

# Set angle labels in degrees
ax.set_xticks(np.deg2rad(np.arange(0, 360, 45)))
ax.set_xticklabels(['0°', '45°', '90°', '135°', '180°', '225°', '270°', '315°'])

# Add title
plt.title('Distribution of Objects by Avg Heading')

# Display the plot
plt.show()
_images/c8526fe3d4f0af61eced4f207ee4dfda22d2808bebc1a776d1b4d52984ba751c.png

Calculating amount of objects in different directions for pedestrians#

# Natural (Jenks) breaks over the pedestrians' average heading, 5 classes
pedestrians_heading_breaks = jenkspy.jenks_breaks(lines_gdf_pedestrians['avg_heading'], 5)

# Create a direction-class column based on the Jenks breaks
lines_gdf_pedestrians['direction_class'] = pd.cut(lines_gdf_pedestrians['avg_heading'], bins=pedestrians_heading_breaks, labels=False, include_lowest=True)

# Count the number of objects in each direction class
direction_class_counts = lines_gdf_pedestrians['direction_class'].value_counts().sort_index()

print("Breaks:", pedestrians_heading_breaks)
print("Direction Class Counts:\n", direction_class_counts)
Breaks: [3.052, 89.59200000000001, 160.552, 228.439, 285.552, 358.052]
Direction Class Counts:
 0    388
1    371
2    411
3    358
4    241
Name: direction_class, dtype: int64
/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/geopandas/geodataframe.py:1538: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
# Creating Histogram
sns.histplot(data=lines_gdf_pedestrians, x='avg_heading', hue='direction_class', bins=20, alpha=0.8, palette='BuPu', edgecolor='lightgray')

# Removing graph border
sns.despine()

# Adding titlle and axis names
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram, bins colored by class')

# Showing graph
plt.show()
_images/90ee75afc1518faa1f4fda568fcbf255731c9944abfb51791a1f149c76069a4d.png
# Express the average headings in radians so they can be binned on a polar axis.
lines_gdf_pedestrians['avg_heading_rad'] = np.deg2rad(lines_gdf_pedestrians['avg_heading'])

# Set up a polar subplot.
plt.figure(figsize=(10, 6))
ax = plt.subplot(111, projection='polar')

# Orient the plot like a compass: 0 degrees at the top, angles increasing clockwise.
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)

# Split the full circle into equal angular bins.
num_bins = 8
bins = np.linspace(0, 2 * np.pi, num_bins + 1)

# Histogram the headings over those angular bins.
hist, edges = np.histogram(lines_gdf_pedestrians['avg_heading_rad'], bins=bins)

# Draw one bar per angular bin.
bars = ax.bar(edges[:-1], hist, width=np.diff(edges), align='edge', edgecolor='black')

# Tick labels in degrees at 45-degree increments.
ax.set_xticks(np.deg2rad(np.arange(0, 360, 45)))
ax.set_xticklabels(['0°', '45°', '90°', '135°', '180°', '225°', '270°', '315°'])

# Title and render.
plt.title('Distribution of Objects by Avg Heading')
plt.show()
/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/geopandas/geodataframe.py:1538: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
_images/f92021ffa04f3343797ae8be2edb1a2f365f209d2442856c2bfac4367a8ee7d5.png

Calculating amount of objects in different directions for two-wheelers#

# Compute five natural-breaks (Jenks) class boundaries for two-wheeler headings.
two_wheelers_heading_breaks = jenkspy.jenks_breaks(lines_gdf_two_wheelers['avg_heading'], 5)

# Create a direction-class column based on the breaks.
lines_gdf_two_wheelers['direction_class'] = pd.cut(lines_gdf_two_wheelers['avg_heading'], bins=two_wheelers_heading_breaks, labels=False, include_lowest=True)

# Count the number of objects in each direction class.
direction_class_counts = lines_gdf_two_wheelers['direction_class'].value_counts().sort_index()

# BUG FIX: this cell previously printed cars_heading_breaks (a copy-paste
# leftover from the cars section); print the breaks actually computed above.
print("Breaks:", two_wheelers_heading_breaks)
print("Direction Class Counts:\n", direction_class_counts)
# Creating Histogram of two-wheeler headings colored by direction class.
# BUG FIX: this cell plotted lines_gdf_cars, but the surrounding section
# analyses two-wheelers — use lines_gdf_two_wheelers instead.
sns.histplot(data=lines_gdf_two_wheelers, x='avg_heading', hue='direction_class', bins=20, alpha=0.8, palette='BuPu', edgecolor='lightgray')

# Removing graph border
sns.despine()

# Adding title and axis names
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram, bins colored by class')

# Showing graph (plt.show() was missing, so the cell echoed the Text object
# returned by plt.title instead of rendering the figure)
plt.show()
Text(0.5, 1.0, 'Histogram, bins colored by class')
_images/6dcbfe9362ceda41dd9144d7de3dbe38872b09e62ada3fd47abeb71756553ef9.png
# Convert angles from degrees to radians for the polar plot.
# BUG FIX: this cell operated entirely on lines_gdf_cars although it sits in
# the two-wheelers section — switched to lines_gdf_two_wheelers throughout.
lines_gdf_two_wheelers['avg_heading_rad'] = np.deg2rad(lines_gdf_two_wheelers['avg_heading'])

# Create the polar plot
plt.figure(figsize=(10, 6))
ax = plt.subplot(111, projection='polar')

# Configure the polar plot: 0 degrees at the top and clockwise direction
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)

# Define the number of bins
num_bins = 8
bins = np.linspace(0, 2 * np.pi, num_bins + 1)

# Count the number of objects in each bin
hist, edges = np.histogram(lines_gdf_two_wheelers['avg_heading_rad'], bins=bins)

# Plot the histogram on the polar plot
bars = ax.bar(edges[:-1], hist, width=np.diff(edges), align='edge', edgecolor='black')

# Set angle labels in degrees
ax.set_xticks(np.deg2rad(np.arange(0, 360, 45)))
ax.set_xticklabels(['0°', '45°', '90°', '135°', '180°', '225°', '270°', '315°'])

# Add title
plt.title('Distribution of Objects by Avg Heading')

# Display the plot
plt.show()
_images/c8526fe3d4f0af61eced4f207ee4dfda22d2808bebc1a776d1b4d52984ba751c.png

Conclusion#